Micro-level data for the narchiver

Nick Janetos

January 9th, 2016

This document containts scratch work on viewing micro-level data in the narchiver.

In [9]:
# Imports and convenient function
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

# Load all the data
exec(open("load_data.py").read())
%matplotlib inline

Top 30 vendors, listings

In [12]:
# Count the number of reviews for each vendor
for v in vendors['ID']:
    vendors.set_value(vendors['ID'] == v, 'NUM_REVIEWS', len(reviews[reviews['VENDOR'] == v]))

# Count the number of reviews for each listing
for l in listings['ID']:
    listings.set_value(listings['ID'] == l, 'NUM_REVIEWS', len(reviews[reviews['LISTING'] == l]))
In [29]:
top_vendors = vendors.sort_values('NUM_REVIEWS', ascending = False)[0:29]
print(top_vendors)
        ID                  NAME  NUM_REVIEWS
794    844             Drugs4you         7636
506    535        canadianforger         7419
715    756         cannabis-king         6954
491    519          psychotropix         6925
727    769            lostheaven         6753
394    420         HumboldtFarms         6689
934    998             Markovich         6530
375    401              Discover         6266
1257  1347                  Zues         5982
909    968            rocketship         5770
832    884      ConcentratesKing         5206
417    444               gene001         4892
1180  1263             FMATTHEWS         4561
642    680              Mr.Lewis         4320
1420  1522                 Petch         4091
396    422             RXChemist         4091
897    955       EmeraldTriangle         3989
628    665        thecheekygirls         3918
1022  1091          fredthebaker         3915
814    865            Supersolid         3787
928    992      TheProfessionals         3688
356    381             Evangitis         3564
326    348            medsforyou         3479
275    294                  DrRx         3361
426    453             cali_kush         3272
316    337  CaliforniaGreenCross         3270
374    400           europedrugs         3226
430    457            Dimercurio         3172
556    589                Amsint         3141

Price history, top 10 vendors

Displays, in sequence, for each of the 10 ten vendors (as measured by reviews left)

  1. The ratings path of the vendor.
  2. The recorded min/max sales of that vendor.
  3. The estimates reviews left per day for that vendor.
  4. The (normalized) prices offered by that vendor, by category.
In [30]:
for vendor_id in top_vendors['ID']:
    
    listings_all = listings[listings['VENDOR'] == vendor_id]
    listings_all = listings_all.sort_values('NUM_REVIEWS', ascending = False)
    listings_all = listings_all[0:min(4, len(listings_all))]
    prices_all = prices[prices['VENDOR'] == vendor_id]
    num_listings = len(listings_all)

    unique_categories = len(set(listings_all['CATEGORY']))

    fig, axes = plt.subplots(nrows = unique_categories + 3, ncols = 1, figsize = (16, (unique_categories + 3)*6))

    dateval = prices_all[['DATE', 'RATING']]
    dateval = dateval.sort_values('DATE', ascending = True)
    vals = [r for r in dateval['RATING']]
    dates = dateval['DATE']*86400
    dates  = [datetime.datetime.fromtimestamp(d) for d in dates]
    axes[0].plot(dates, vals, linewidth = 3)
    axes[0].fmt_xdata = mdates.DateFormatter('%m-%d')
    axes[0].set_title("Path of ratings for vendor " + vendors[vendors['ID'] == vendor_id]['NAME'])
    plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=25)
    axes[0].set_xlim(min_date, max_date)
    axes[0].set_ylim(4.8, 5)

    dateval = prices_all[['DATE', 'MIN_SALES']]
    dateval = dateval.sort_values('DATE', ascending = True)
    vals = [r for r in dateval['MIN_SALES']]
    dates = dateval['DATE']*86400
    dates  = [datetime.datetime.fromtimestamp(d) for d in dates]
    axes[1].plot(dates, vals, linewidth = 3)
    dateval = prices_all[['DATE', 'MAX_SALES']]
    dateval = dateval.sort_values('DATE', ascending = True)
    vals = [r for r in dateval['MAX_SALES']]
    dates = dateval['DATE']*86400
    dates  = [datetime.datetime.fromtimestamp(d) for d in dates]
    axes[1].plot(dates, vals, linewidth = 3)
    axes[1].fmt_xdata = mdates.DateFormatter('%m-%d')
    axes[1].set_title("Min / max sales")
    plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=25)
    axes[1].set_xlim(min_date, max_date)
    
    dateval = prices_all[['DATE', 'V_REVIEWS_PER_DAY']]
    dateval = dateval.sort_values('DATE', ascending = True)
    vals = [r for r in dateval['V_REVIEWS_PER_DAY']]
    dates = dateval['DATE']*86400
    dates  = [datetime.datetime.fromtimestamp(d) for d in dates]
    axes[2].plot(dates, vals, linewidth = 3)
    axes[2].fmt_xdata = mdates.DateFormatter('%m-%d')
    axes[2].set_title("Estimated daily review rate")
    plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=25)
    axes[2].set_xlim(min_date, max_date)

    i = 3
    for c in set(listings_all['CATEGORY']):

        ax = axes[i].twinx()
        axes[i].ticklabel_format(useOffset = False)
        ax.ticklabel_format(useOffset = False)
        
        for l in listings_all[listings_all['CATEGORY'] == c]['ID']:
            normalized = prices_all[prices_all['LISTING'] == l]
            vals = [around(p, decimals = 4) for p in normalized['NORMALIZED']]
            dates = normalized['DATE']*86400
            dates  = [datetime.datetime.fromtimestamp(d) for d in dates]
            axes[i].plot(dates, vals, label = "label", linewidth = 2)
            vals = [p for p in normalized['REVIEWS_PER_DAY']]
            ax.plot(dates, vals, label = "label", linewidth = 2, linestyle = "dashed")
            

        axes[i].fmt_xdata = mdates.DateFormatter('%m-%d')
        axes[i].set_title(categories[c - 1][0])
        plt.setp(axes[i].xaxis.get_majorticklabels(), rotation=25)
        axes[i].set_xlim(min_date, max_date)
        axes[i].margins(y = 0.1)
        i = i + 1
/usr/local/lib/python3.4/dist-packages/matplotlib/pyplot.py:516: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)